In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import calendar
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
In [2]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import accuracy_score
In [3]:
import statsmodels.tsa.stattools as sts
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.graphics.tsaplots as tsa
import statsmodels.stats.diagnostic as diag
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.stattools import adfuller, kpss
from scipy import signal
import statsmodels.api as sm
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from pmdarima.arima import ADFTest
from pmdarima.arima import auto_arima
In [55]:
from gluonts.dataset.common import ListDataset
from gluonts.model.deepar import DeepAREstimator
from gluonts.mx.trainer import Trainer
from gluonts.evaluation import make_evaluation_predictions, Evaluator
from gluonts.dataset.util import to_pandas
In [4]:
import fbprophet
from fbprophet import Prophet
from fbprophet.plot import plot_plotly, plot_components_plotly
from fbprophet.diagnostics import cross_validation
from fbprophet.diagnostics import performance_metrics
In [5]:
plt.rcParams["figure.figsize"] = (12,6)
In [6]:
df = pd.read_csv("hotel_bookings.csv")
In [7]:
df.head(5)
Out[7]:
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults ... deposit_type agent company days_in_waiting_list customer_type adr required_car_parking_spaces total_of_special_requests reservation_status reservation_status_date
0 Resort Hotel 0 342 2015 July 27 1 0 0 2 ... No Deposit NaN NaN 0 Transient 0.0 0 0 Check-Out 2015-07-01
1 Resort Hotel 0 737 2015 July 27 1 0 0 2 ... No Deposit NaN NaN 0 Transient 0.0 0 0 Check-Out 2015-07-01
2 Resort Hotel 0 7 2015 July 27 1 0 1 1 ... No Deposit NaN NaN 0 Transient 75.0 0 0 Check-Out 2015-07-02
3 Resort Hotel 0 13 2015 July 27 1 0 1 1 ... No Deposit 304.0 NaN 0 Transient 75.0 0 0 Check-Out 2015-07-02
4 Resort Hotel 0 14 2015 July 27 1 0 2 2 ... No Deposit 240.0 NaN 0 Transient 98.0 0 1 Check-Out 2015-07-03

5 rows × 32 columns

In [8]:
df.shape
Out[8]:
(119390, 32)
In [9]:
df.isna().sum()
Out[9]:
hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company                           112593
days_in_waiting_list                   0
customer_type                          0
adr                                    0
required_car_parking_spaces            0
total_of_special_requests              0
reservation_status                     0
reservation_status_date                0
dtype: int64
In [10]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal                            119390 non-null  object 
 13  country                         118902 non-null  object 
 14  market_segment                  119390 non-null  object 
 15  distribution_channel            119390 non-null  object 
 16  is_repeated_guest               119390 non-null  int64  
 17  previous_cancellations          119390 non-null  int64  
 18  previous_bookings_not_canceled  119390 non-null  int64  
 19  reserved_room_type              119390 non-null  object 
 20  assigned_room_type              119390 non-null  object 
 21  booking_changes                 119390 non-null  int64  
 22  deposit_type                    119390 non-null  object 
 23  agent                           103050 non-null  float64
 24  company                         6797 non-null    float64
 25  days_in_waiting_list            119390 non-null  int64  
 26  customer_type                   119390 non-null  object 
 27  adr                             119390 non-null  float64
 28  required_car_parking_spaces     119390 non-null  int64  
 29  total_of_special_requests       119390 non-null  int64  
 30  reservation_status              119390 non-null  object 
 31  reservation_status_date         119390 non-null  object 
dtypes: float64(4), int64(16), object(12)
memory usage: 29.1+ MB

Preprocessing the data

In [11]:
df["children"].fillna(0, inplace = True)
In [12]:
df["children"] = df["children"].astype(int)

Converting to datetime

In [13]:
df = df.rename(columns = {"arrival_date_year" : "year", "arrival_date_day_of_month" : "day"})
In [14]:
df["month"] = df["arrival_date_month"].apply(lambda x : datetime.strptime((x[0:3]), "%b").month)
In [15]:
df["datetime"] = pd.to_datetime(df[["year", "month", "day"]])

Dropping the rows which have no guests

In [16]:
filter = (df["adults"] == 0) & (df["children"] == 0) & (df["babies"] == 0)
In [17]:
df = df[~filter]
In [18]:
df.shape
Out[18]:
(119210, 34)
In [19]:
df["reservation_status_date"] = pd.to_datetime(df["reservation_status_date"])
In [20]:
df["Total Guests"] = df["adults"] +  df["children"] + df["babies"]
In [21]:
df["Total stay"] = df["stays_in_week_nights"] +  df["stays_in_weekend_nights"]
In [22]:
df[df["adults"] == 0][["adults", "children", "babies"]]
Out[22]:
adults children babies
40984 0 3 0
41048 0 2 0
41446 0 2 0
41952 0 2 0
45158 0 2 0
... ... ... ...
117204 0 2 0
117274 0 2 0
117303 0 2 0
117453 0 2 0
118200 0 3 0

223 rows × 3 columns

In [23]:
data = df.copy()
In [24]:
data = data.set_index("datetime")
In [25]:
data = data.sort_index()
In [26]:
data.shape
Out[26]:
(119210, 35)

Seggregating between resort hotel and city hotel

In [27]:
rh = data[data["hotel"] == "Resort Hotel"]
ch = data[data["hotel"] == "City Hotel"]
In [28]:
rh_canceled = rh[rh["is_canceled"] == 1]
rh_checked = rh[rh["is_canceled"] == 0]
ch_canceled = ch[ch["is_canceled"] == 1]
ch_checked = ch[ch["is_canceled"] == 0]

Getting the number of guests that actually arrive according to date for resort and city hotel

In [29]:
rh_ts = rh_checked[["Total Guests"]]
rh_ts = rh_ts.groupby(rh_ts.index).sum()
In [30]:
ch_ts = ch_checked[["Total Guests"]]
ch_ts = ch_ts.groupby(ch_ts.index).sum()

Sampling from daily to weekly

In [31]:
rh_ts_weekly = rh_ts.resample("W").sum()
ch_ts_weekly = ch_ts.resample("W").sum()

Getting the number of bookings per date

In [32]:
rh_num = pd.DataFrame(rh.groupby(rh.index).size(), columns = ["Number_of_Bookings"])
In [33]:
ch_num = pd.DataFrame(ch.groupby(ch.index).size(), columns = ["Number_of_Bookings"])
In [34]:
rh_num.head(5)
Out[34]:
Number_of_Bookings
datetime
2015-07-01 43
2015-07-02 44
2015-07-03 40
2015-07-04 50
2015-07-05 45
In [35]:
ch_num.head(5)
Out[35]:
Number_of_Bookings
datetime
2015-07-01 79
2015-07-02 49
2015-07-03 16
2015-07-04 38
2015-07-05 8

Below are certains functions for analysis before forecasting like

1) checking for white noise,

2) stationarity test (aduller and kpss)

3) splitting and determing the trend and seasonality

4) Visualizing the rolling mean and standard deviation

In [36]:
def visualize_time_series(df, col_name):
    fig = px.line(df, x=df.index, y=col_name)
    fig.update_xaxes(
        rangeslider_visible=True,
        rangeselector=dict(
            buttons=list([
                dict(count=1, label="1m", step="month", stepmode="backward"),
                dict(count=6, label="6m", step="month", stepmode="backward"),
                dict(count=1, label="YTD", step="year", stepmode="todate"),
                dict(count=1, label="1y", step="year", stepmode="backward"),
                dict(step="all")
            ])
        )
    )
    fig.show()

The Ljung Box test is a way to test for the absence of serial autocorrelation, up to a specified lag k. The test determines whether or not errors are white noise or whether there is something more behind them.

If p-value less than significance level, then it is not pure white noise.

In [37]:
def check_for_white_noise(df):
    autocorrelation_plot(df)
    plt.show()
    tsa.plot_acf(df, lags = 40, alpha = 0.05, title = 'Auto correlation plot')
    a = diag.acorr_ljungbox(df, lags = [40], boxpierce = True)
    print(a)
In [38]:
def decomposition(df):
#     multiplicative_decomposition =  seasonal_decompose(df["dp_Value"], model = "multiplicative", extrapolate_trend = "freq")
    additive_decomposition = seasonal_decompose(df, model = "additive", extrapolate_trend = "freq", freq = 7)
    plt.rcParams.update({'figure.figsize' : (16,12)})
#     multiplicative_decomposition.plot().suptitle('Multiplicative Decomposition', fontsize = 16)
    additive_decomposition.plot().suptitle("Additive Decomposition", fontsize = 16)
    plt.show()
In [39]:
def plot_rolling_mean(df, col_name, window):
    rolling_mean = df.rolling(window = window).mean()
    rolling_std = df.rolling(window = window).std()
    fig = go.Figure()
    trace1 = go.Scatter(x = df.index, y = df[col_name], mode = "lines", name = col_name)
    trace2 = go.Scatter(x = rolling_mean.index, y = rolling_mean[col_name], mode = "lines", name = "Rolling Mean")
    trace3 = go.Scatter(x = rolling_std.index, y = rolling_std[col_name], mode = "lines", name = "Rolling Standard Deviation")
    fig.add_trace(trace1)
    fig.add_trace(trace2)
    fig.add_trace(trace3)
    fig.update_layout(width = 1100)
    fig.show()
In [40]:
def adf_test(df):
    result = adfuller(df, autolag = "AIC")
    dfoutput = pd.Series(result[0:4], index=['Test Statistic','p-value',' Lags Used','Number of Observations Used'])
    print('The test statistic: %f' % result[0])
    print('p-value: %f' % result[1])
    print('Critical Values:')
    for key, value in result[4].items():
        print('%s: %.3f' % (key, value))
    if result[1]<0.05:
        print("The p-value is less than 0.05, hence we can reject the null hypothesis and say that the time series is stationary")
    else:
        print("The series is not stationary")
In [41]:
def kpss_test(df):
    result = kpss(df, regression='c')
    print('\nKPSS Statistic: %f' % result[0])
    print('p-value: %f' % result[1])
    for key, value in result[3].items():
        print('Critial Values:')
        print(f' \t\t {key}: \t {value}')
    print("Result: The series is {} stationary".format("not" if result[1]<0.05 else ""))

Helper functions to get train and test set, plot forecast, get evaluation

In [42]:
def get_train_test(df, split):
    split = split
    df_train = df[0:-split]
    df_test = df[-split:]
    return(df_train, df_test)
In [70]:
def forecast_plotly_data(self, training_data, prediction_intervals=(50.0, 90.0), show_mean=False, color="b", label=None, output_file=None, *args, **kwargs,):
    fig = go.Figure()

    label_prefix = "" if label is None else label + "-"

    for c in prediction_intervals:
        assert 0.0 <= c <= 100.0

    ps = [50.0] + [
        50.0 + f * c / 2.0
        for c in prediction_intervals
        for f in [-1.0, +1.0]
    ]
    percentiles_sorted = sorted(set(ps))
    
    def alpha_for_percentile(p):
        return (p / 100.0) ** 1.5
    
    ps_data = [self.quantile(p / 100.0) for p in percentiles_sorted]
    i_p50 = len(percentiles_sorted) // 2
    
    p50_data = ps_data[i_p50]
    p50_series = pd.Series(data=p50_data, index=self.index)
    
    data = to_pandas(training_data)[-365:]
    
    trace1 = go.Scatter(
        name = "Device Values",
        x = data.index,
        y = data,
        mode = "lines",
        line=dict(
        color='darkgreen',
        width=1
    )
#         line = dict(color = 'rgb(0,10,255)'),
    )
    
    fig.add_trace(trace1)
    
    trace2 = go.Scatter(
        name='Forecast Median',
        x=p50_series.index,
        y=p50_series,
        mode='lines',
        line=dict(
        color='orange',
        width=2
    )
#         line=dict(color='rgb(255, 0, 0)'),
    )
    
    fig.add_trace(trace2)
    opacity_colour = ['rgba(250, 0, 0, 0.2)', 'rgba(250, 0, 0, 0.4)']
    for i in range(len(percentiles_sorted) // 2):
        ptile = percentiles_sorted[i]
        alpha = alpha_for_percentile(ptile)
        x = p50_series.index
        y_lower = ps_data[i]
        y_upper = ps_data[-i-1]
        fig.add_trace(go.Scatter(
        name=f"{label_prefix}{100 - ptile * 2}%",
        x=x,
        y=y_upper,
        mode='lines',
        marker=dict(color="#444"),
        line=dict(width=0),
        showlegend=False
    )),
        fig.add_trace(go.Scatter(
            name=f"{label_prefix}{100 - ptile * 2}%",
            x=x,
            y=y_lower,
            marker=dict(color="#444"),
            line=dict(width=0),
            mode='lines',
            fillcolor=opacity_colour[i],
            fill='tonexty',
            showlegend=True
        ))
    return(fig)
In [77]:
def get_evaluation_gluonts(test_data, predictor):
    forecast_it, ts_it = make_evaluation_predictions(
    dataset=test_data,  # test dataset
    predictor=predictor,  # predictor
    num_samples=100,  # number of sample paths we want for evaluation
    )

    forecasts = list(forecast_it)
    tss = list(ts_it)

    evaluator = Evaluator()
    agg_metrics, item_metrics = evaluator(iter(tss), iter(forecasts), num_series=1)
    return(agg_metrics, item_metrics)

Forecasting algorithms used :

A) Deep AR (Deep Autoregressive Neural Networks - Gluonts library)

B) Auto ARIMA

C) Prophet forecasting (By Facebook)

Forecasting for Resort Hotel

In [45]:
visualize_time_series(rh_num, "Number_of_Bookings")
Jul 2015Oct 2015Jan 2016Apr 2016Jul 2016Oct 2016Jan 2017Apr 2017Jul 2017050100150200
1m6mYTD1yallNumber_of_Bookingsdatetime
In [46]:
check_for_white_noise(rh_num["Number_of_Bookings"])
(array([215.72165948]), array([6.01299012e-26]), array([211.02647626]), array([4.15980807e-25]))

Since p-value is less than 0.05, it is not white noise

In [47]:
decomposition(rh_num["Number_of_Bookings"])
In [48]:
plot_rolling_mean(rh_num,"Number_of_Bookings",30)
Jul 2015Oct 2015Jan 2016Apr 2016Jul 2016Oct 2016Jan 2017Apr 2017Jul 2017050100150200
Number_of_BookingsRolling MeanRolling Standard Deviation
In [49]:
adf_test(rh_num["Number_of_Bookings"])
The test statistic: -4.069195
p-value: 0.001089
Critical Values:
1%: -3.439
5%: -2.865
10%: -2.569
The p-value is less than 0.05, hence we can reject the null hypothesis and say that the time series is stationary

1) Deep - AR Forecasting

Checking for missing dates

In [52]:
pd.date_range(start = rh_num.index.values[0], end = rh_num.index.values[-1]).difference(rh_num.index)
Out[52]:
DatetimeIndex([], dtype='datetime64[ns]', freq=None)
In [53]:
rh_train, rh_test = get_train_test(rh_num, 60)

Converting data into training format for gluonts

In [56]:
rh_training_data = ListDataset(
    [{"start": rh_train.index[0], "target": rh_train["Number_of_Bookings"]}],
    freq = "D"
)
In [57]:
rh_test_data = ListDataset(
    [{'start' : rh_num.index[0], "target" : rh_num["Number_of_Bookings"]}],
    freq = "D"
)

Predicting for 60 days

In [63]:
prediction_length = 60
In [64]:
estimator = DeepAREstimator(freq="D", prediction_length=prediction_length, trainer=Trainer(epochs=30, num_batches_per_epoch = 50, learning_rate = 1e-3,batch_size = 32))
predictor = estimator.train(training_data=rh_training_data)
  0%|                                                                                           | 0/50 [00:00<?, ?it/s]
learning rate from ``lr_scheduler`` has been overwritten by ``learning_rate`` in optimizer.
100%|█████████████████████████████████████████████████| 50/50 [00:06<00:00,  7.50it/s, epoch=1/30, avg_epoch_loss=4.75]
100%|█████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.75it/s, epoch=2/30, avg_epoch_loss=4.47]
100%|█████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.23it/s, epoch=3/30, avg_epoch_loss=4.42]
100%|█████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.24it/s, epoch=4/30, avg_epoch_loss=4.38]
100%|█████████████████████████████████████████████████| 50/50 [00:06<00:00,  7.79it/s, epoch=5/30, avg_epoch_loss=4.33]
100%|█████████████████████████████████████████████████| 50/50 [00:07<00:00,  6.81it/s, epoch=6/30, avg_epoch_loss=4.27]
100%|██████████████████████████████████████████████████| 50/50 [00:06<00:00,  7.90it/s, epoch=7/30, avg_epoch_loss=4.2]
100%|█████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.12it/s, epoch=8/30, avg_epoch_loss=4.14]
100%|█████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.15it/s, epoch=9/30, avg_epoch_loss=4.07]
100%|████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.33it/s, epoch=10/30, avg_epoch_loss=4.02]
100%|███████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.30it/s, epoch=11/30, avg_epoch_loss=4]
100%|████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.57it/s, epoch=12/30, avg_epoch_loss=3.93]
100%|████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.62it/s, epoch=13/30, avg_epoch_loss=3.92]
100%|████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.53it/s, epoch=14/30, avg_epoch_loss=3.84]
100%|████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.52it/s, epoch=15/30, avg_epoch_loss=3.81]
100%|████████████████████████████████████████████████| 50/50 [00:06<00:00,  7.61it/s, epoch=16/30, avg_epoch_loss=3.74]
100%|████████████████████████████████████████████████| 50/50 [00:06<00:00,  7.60it/s, epoch=17/30, avg_epoch_loss=3.74]
100%|████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.56it/s, epoch=18/30, avg_epoch_loss=3.65]
100%|████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.82it/s, epoch=19/30, avg_epoch_loss=3.62]
100%|████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.48it/s, epoch=20/30, avg_epoch_loss=3.57]
100%|████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.08it/s, epoch=21/30, avg_epoch_loss=3.53]
100%|█████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.22it/s, epoch=22/30, avg_epoch_loss=3.5]
100%|████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.52it/s, epoch=23/30, avg_epoch_loss=3.47]
100%|█████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.22it/s, epoch=24/30, avg_epoch_loss=3.4]
100%|████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.07it/s, epoch=25/30, avg_epoch_loss=3.38]
100%|████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.04it/s, epoch=26/30, avg_epoch_loss=3.35]
100%|████████████████████████████████████████████████| 50/50 [00:06<00:00,  7.26it/s, epoch=27/30, avg_epoch_loss=3.34]
100%|████████████████████████████████████████████████| 50/50 [00:06<00:00,  7.86it/s, epoch=28/30, avg_epoch_loss=3.29]
100%|████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.58it/s, epoch=29/30, avg_epoch_loss=3.27]
100%|████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.67it/s, epoch=30/30, avg_epoch_loss=3.22]
In [79]:
rh_forecast = predictor.predict(rh_training_data)
In [80]:
rh_forecast_list = list(rh_forecast)
In [81]:
prediction_interval = [50.0,98.0]
fig = forecast_plotly_data(rh_forecast_list[0], list(rh_test_data)[0], prediction_intervals = prediction_interval)
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            dict(count=1, label="1m", step="month", stepmode="backward"),
            dict(count=6, label="6m", step="month", stepmode="backward"),
            dict(count=1, label="YTD", step="year", stepmode="todate"),
            dict(count=1, label="1y", step="year", stepmode="backward"),
            dict(count = 9, label = "9m", step = "month", stepmode = "backward"),
            dict(step="all")
        ])
    )
)
fig.update_layout(title = "Deep AR Forecasting for Number of Bookings - Resort Hotel", width = 1050)
fig.show()
Sep 2016Nov 2016Jan 2017Mar 2017May 2017Jul 201720406080100120140
50.0%98.0%Forecast MedianDevice Values1m6mYTD1y9mallDeep AR Forecasting for Number of Bookings - Resort Hotel
In [124]:
fig.update_layout(width = 1500)
with open("Forecasting_Graphs.html", "w") as f:
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn'))

Evaluating the model

In [83]:
forecast_it, ts_it = make_evaluation_predictions(
    dataset=rh_test_data,  # test dataset
    predictor=predictor,  # predictor
    num_samples=100,  # number of sample paths we want for evaluation
)
In [84]:
rh_forecasts = list(forecast_it)
rh_tss = list(ts_it)
In [85]:
evaluator = Evaluator()
agg_metrics, item_metrics = evaluator(iter(rh_tss), iter(rh_forecasts), num_series=1)
Running evaluation: 100%|████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 71.56it/s]
In [86]:
rh_metrics_dict = {}
metrics_list = ['MSE','RMSE', 'MASE','MAPE','sMAPE']

for metric in metrics_list:
    rh_metrics_dict[metric] = agg_metrics.get(metric)
    print(metric + ": ", rh_metrics_dict[metric])
MSE:  300.1763020833333
RMSE:  17.325596730945037
MASE:  0.6019393695253314
MAPE:  0.24584970474243165
sMAPE:  0.2311933994293213

2) Auto - ARIMA Forecasting

In [58]:
def fit_auto_arima(df_train, df_test, m, hotel):
    arima_model = auto_arima(df_train, start_p = 1, d = 0, start_q = 1, max_p = 3, max_d = 2, max_q = 3, start_P = 0, D = 1, start_Q = 0, m = m, seasonal = True, error_action = 'warn', trace = True, supress_warnings = True, stepwise = True)
    arima_model.summary()
    prediction = pd.DataFrame(arima_model.predict(n_periods = len(df_test), index = df_test.index))
    prediction.columns = ['predicted_values']
    plt.figure(figsize = (14,4))
    plt.plot(df_train, label = "Training")
    plt.plot(df_test, label = "Test")
    plt.plot(df_test.index,prediction, label = "Predicted")
    plt.title(f"Auto ARIMA for number of bookings for {hotel}")
    plt.legend()
    plt.show()
    return(arima_model,prediction)
In [59]:
rh_train, rh_test = get_train_test(rh_num, 60)
In [60]:
arima_model, arima_pred = fit_auto_arima(rh_train, rh_test, 12, "resort")
Performing stepwise search to minimize aic
 ARIMA(1,0,1)(0,1,0)[12] intercept   : AIC=inf, Time=0.55 sec
 ARIMA(0,0,0)(0,1,0)[12] intercept   : AIC=7094.283, Time=0.04 sec
 ARIMA(1,0,0)(1,1,0)[12] intercept   : AIC=6913.754, Time=0.67 sec
 ARIMA(0,0,1)(0,1,1)[12] intercept   : AIC=inf, Time=0.72 sec
 ARIMA(0,0,0)(0,1,0)[12]             : AIC=7092.323, Time=0.03 sec
 ARIMA(1,0,0)(0,1,0)[12] intercept   : AIC=7092.444, Time=0.09 sec
 ARIMA(1,0,0)(2,1,0)[12] intercept   : AIC=6861.054, Time=3.77 sec
 ARIMA(1,0,0)(2,1,1)[12] intercept   : AIC=inf, Time=3.41 sec
 ARIMA(1,0,0)(1,1,1)[12] intercept   : AIC=inf, Time=1.14 sec
 ARIMA(0,0,0)(2,1,0)[12] intercept   : AIC=6860.374, Time=3.87 sec
 ARIMA(0,0,0)(1,1,0)[12] intercept   : AIC=6913.690, Time=0.71 sec
 ARIMA(0,0,0)(2,1,1)[12] intercept   : AIC=inf, Time=2.37 sec
 ARIMA(0,0,0)(1,1,1)[12] intercept   : AIC=inf, Time=0.81 sec
 ARIMA(0,0,1)(2,1,0)[12] intercept   : AIC=6861.095, Time=2.89 sec
 ARIMA(1,0,1)(2,1,0)[12] intercept   : AIC=inf, Time=7.73 sec
 ARIMA(0,0,0)(2,1,0)[12]             : AIC=6858.525, Time=0.43 sec
 ARIMA(0,0,0)(1,1,0)[12]             : AIC=6911.774, Time=0.17 sec
 ARIMA(0,0,0)(2,1,1)[12]             : AIC=inf, Time=1.85 sec
 ARIMA(0,0,0)(1,1,1)[12]             : AIC=inf, Time=0.80 sec
 ARIMA(1,0,0)(2,1,0)[12]             : AIC=6859.191, Time=0.78 sec
 ARIMA(0,0,1)(2,1,0)[12]             : AIC=6859.234, Time=0.70 sec
 ARIMA(1,0,1)(2,1,0)[12]             : AIC=inf, Time=5.83 sec

Best model:  ARIMA(0,0,0)(2,1,0)[12]          
Total fit time: 39.373 seconds
In [61]:
def performance_metrics_auto_arima(y_true, y_pred):
    print(f"The performance metrics are : ")
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred)           
    print("Mean squared error is ", round(mse,4))
    print("Root mean squared error is ", round(rmse,4))
    print("Mean absolute error is ", round(mae,4))
    print("Mean absolute percentage error is ", round(mape*100,4))
In [62]:
performance_metrics_auto_arima(rh_test["Number_of_Bookings"], arima_pred.predicted_values)
The performance metrics are : 
Mean squared error is  236.0022
Root mean squared error is  15.3624
Mean absolute error is  12.9576
Mean absolute percentage error is  24.5079
In [ ]:
 

3) Prophet Forecasting

In [89]:
rh_m = Prophet()

Converting data into training format for Prophet algorithm

In [90]:
rh_prophet = rh_num.reset_index()
In [92]:
rh_prophet =  rh_prophet.rename(columns = {"datetime" : "ds", "Number_of_Bookings" : "y"})
In [93]:
rh_prophet.head(5)
Out[93]:
ds y
0 2015-07-01 43
1 2015-07-02 44
2 2015-07-03 40
3 2015-07-04 50
4 2015-07-05 45
In [94]:
prh_train, prh_test = get_train_test(rh_prophet, 60)
In [95]:
rh_m.fit(prh_train)
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
Out[95]:
<fbprophet.forecaster.Prophet at 0x19099a4b448>
In [96]:
date_list = pd.date_range(start = prh_test.ds.iloc[0], end = prh_test.ds.iloc[-1])
future = pd.DataFrame(date_list, columns = ["ds"])
In [102]:
forecast_rh_prophet = rh_m.predict(future)

Funtion to plot the forecast

In [117]:
def plot_forecast_prophet_algo(df_train, df_test, forecast, title):
    fig = go.Figure()
    fig.add_trace(go.Scatter(
            name = "Number of Bookings",
            x = df_train.ds[-100:],
            y = df_train.y[-100:],
            mode = "lines",
            line=dict(
            color='blue',
            width=1
        )))
    fig.add_trace(go.Scatter(
            name = "Number of Bookings",
            x = df_test.ds,
            y = df_test.y,
            mode = "lines",
            line=dict(
            color='darkgreen',
            width=1
        )))
    opacity_colour = ['rgba(250, 0, 0, 0.2)', 'rgba(250, 0, 0, 0.4)']
    fig.add_trace(go.Scatter(
            name = "Predicted",
            x = forecast.ds,
            y = forecast.yhat,
            mode = "lines",
            line=dict(
            color='orange',
            width=1
        )))

    fig.add_trace(go.Scatter(
            name="Predicted_upper",
            x=forecast.ds,
            y=forecast.yhat_upper,
            mode='lines',
            marker=dict(color="#444"),
            line=dict(width=0),
            showlegend=False
    ))
    fig.add_trace(go.Scatter(
                name="Predicted_lower",
                x=forecast.ds,
                y=forecast.yhat_lower,
                marker=dict(color="#444"),
                line=dict(width=0),
                mode='lines',
                fillcolor=opacity_colour[0],
                fill='tonexty',
                showlegend=True
            ))

    fig.update_layout(title = f"Prophet Forecasting for Number of Bookings in Future Dates - {title}", width = 1050)
    fig.show()
    return(fig)
In [118]:
fig_prophet_rh = plot_forecast_prophet_algo(prh_train, prh_test, forecast_rh_prophet, "Resort")
Apr 2017May 2017Jun 2017Jul 2017Aug 201720406080100120140
Predicted_lowerPredictedNumber of BookingsNumber of BookingsProphet Forecasting for Number of Bookings in Future Dates - Resort
In [181]:
fig_prophet_rh.update_layout(width = 1500)
with open("Forecasting_Graphs.html", "a") as f:
    f.write(fig_prophet_rh.to_html(full_html = False, include_plotlyjs='cdn'))

Evaluating the model

In [112]:
def performance_metrics_prophet(y_true, y_pred):
    print(f"The performance metrics are: ")
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    meae = median_absolute_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    accuracy_mape = (100-(mape*100))             
    print("Mean squared error is ", round(mse,4))
    print("Mean absolute error is ", round(mae,4))
    print("Root Mean Squared Error is ", round(rmse,4))
    print("Mean absolute percentage error is ", round(mape*100,4))               
In [113]:
y_true = prh_test.y.values
In [114]:
y_pred = forecast_rh_prophet.yhat.values
In [116]:
performance_metrics_prophet(y_true, y_pred)
The performance metrics are: 
Mean squared error is  135.2838
Mean absolute error is  9.466
Root Mean Squared Error is  11.6312
Mean absolute percentage error is  17.0035
In [ ]:
 

Forecasting for City Hotel

In [126]:
visualize_time_series(ch_num, "Number_of_Bookings")
Jul 2015Oct 2015Jan 2016Apr 2016Jul 2016Oct 2016Jan 2017Apr 2017Jul 2017050100150200250300350
1m6mYTD1yallNumber_of_Bookingsdatetime
In [131]:
# check_for_white_noise(ch_num["Number_of_Bookings"])
In [128]:
decomposition(ch_num["Number_of_Bookings"])
In [129]:
plot_rolling_mean(ch_num,"Number_of_Bookings",30)
Jul 2015Oct 2015Jan 2016Apr 2016Jul 2016Oct 2016Jan 2017Apr 2017Jul 2017050100150200250300350
Number_of_BookingsRolling MeanRolling Standard Deviation
In [130]:
adf_test(ch_num["Number_of_Bookings"])
The test statistic: -2.393138
p-value: 0.143683
Critical Values:
1%: -3.439
5%: -2.865
10%: -2.569
The series is not stationary
In [ ]:
 

1) Deep AR Forecasting

Checking for missing dates

In [142]:
pd.date_range(start = ch_num.index.values[0], end = ch_num.index.values[-1]).difference(ch_num.index)
Out[142]:
DatetimeIndex([], dtype='datetime64[ns]', freq=None)
In [143]:
ch_train, ch_test = get_train_test(ch_num, 60)

Converting data into training format for gluonts

In [144]:
ch_training_data = ListDataset(
    [{"start": ch_train.index[0], "target": ch_train["Number_of_Bookings"]}],
    freq = "D"
)
In [145]:
ch_test_data = ListDataset(
    [{'start' : ch_num.index[0], "target" : ch_num["Number_of_Bookings"]}],
    freq = "D"
)

Predicting for 60 days

In [146]:
prediction_length = 60
In [147]:
estimator_ch = DeepAREstimator(freq="D", prediction_length=prediction_length, trainer=Trainer(epochs=30, num_batches_per_epoch = 50, learning_rate = 1e-3,batch_size = 32))
predictor_ch = estimator_ch.train(training_data=ch_training_data)
  0%|                                                                                           | 0/50 [00:00<?, ?it/s]
learning rate from ``lr_scheduler`` has been overwritten by ``learning_rate`` in optimizer.
100%|█████████████████████████████████████████████████| 50/50 [00:06<00:00,  7.73it/s, epoch=1/30, avg_epoch_loss=5.46]
100%|█████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.52it/s, epoch=2/30, avg_epoch_loss=5.15]
100%|█████████████████████████████████████████████████| 50/50 [00:06<00:00,  7.77it/s, epoch=3/30, avg_epoch_loss=5.07]
100%|████████████████████████████████████████████████████| 50/50 [00:07<00:00,  7.12it/s, epoch=4/30, avg_epoch_loss=5]
100%|█████████████████████████████████████████████████| 50/50 [00:06<00:00,  7.80it/s, epoch=5/30, avg_epoch_loss=4.94]
100%|█████████████████████████████████████████████████| 50/50 [00:06<00:00,  7.61it/s, epoch=6/30, avg_epoch_loss=4.89]
100%|██████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.21it/s, epoch=7/30, avg_epoch_loss=4.8]
100%|█████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.13it/s, epoch=8/30, avg_epoch_loss=4.73]
100%|█████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.24it/s, epoch=9/30, avg_epoch_loss=4.66]
100%|█████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.33it/s, epoch=10/30, avg_epoch_loss=4.6]
100%|████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.29it/s, epoch=11/30, avg_epoch_loss=4.56]
100%|████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.31it/s, epoch=12/30, avg_epoch_loss=4.49]
100%|████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.17it/s, epoch=13/30, avg_epoch_loss=4.43]
100%|████████████████████████████████████████████████| 50/50 [00:06<00:00,  7.56it/s, epoch=14/30, avg_epoch_loss=4.39]
100%|████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.01it/s, epoch=15/30, avg_epoch_loss=4.35]
100%|████████████████████████████████████████████████| 50/50 [00:06<00:00,  7.82it/s, epoch=16/30, avg_epoch_loss=4.31]
100%|████████████████████████████████████████████████| 50/50 [00:06<00:00,  7.79it/s, epoch=17/30, avg_epoch_loss=4.28]
100%|████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.12it/s, epoch=18/30, avg_epoch_loss=4.21]
100%|████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.35it/s, epoch=19/30, avg_epoch_loss=4.19]
100%|████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.38it/s, epoch=20/30, avg_epoch_loss=4.16]
100%|████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.39it/s, epoch=21/30, avg_epoch_loss=4.11]
100%|████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.82it/s, epoch=22/30, avg_epoch_loss=4.07]
100%|████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.68it/s, epoch=23/30, avg_epoch_loss=4.09]
100%|████████████████████████████████████████████████| 50/50 [00:06<00:00,  7.93it/s, epoch=24/30, avg_epoch_loss=4.03]
100%|████████████████████████████████████████████████| 50/50 [00:07<00:00,  6.57it/s, epoch=25/30, avg_epoch_loss=3.97]
100%|████████████████████████████████████████████████| 50/50 [00:06<00:00,  7.80it/s, epoch=26/30, avg_epoch_loss=3.94]
100%|████████████████████████████████████████████████| 50/50 [00:06<00:00,  7.87it/s, epoch=27/30, avg_epoch_loss=3.94]
100%|████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.41it/s, epoch=28/30, avg_epoch_loss=3.94]
100%|████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.17it/s, epoch=29/30, avg_epoch_loss=3.88]
100%|████████████████████████████████████████████████| 50/50 [00:06<00:00,  7.94it/s, epoch=30/30, avg_epoch_loss=3.85]
In [148]:
ch_forecast = predictor.predict(ch_training_data)
In [149]:
ch_forecast_list = list(ch_forecast)
In [153]:
prediction_interval = [50.0,99.0]
fig = forecast_plotly_data(ch_forecast_list[0], list(ch_test_data)[0], prediction_intervals = prediction_interval)
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            dict(count=1, label="1m", step="month", stepmode="backward"),
            dict(count=6, label="6m", step="month", stepmode="backward"),
            dict(count=1, label="YTD", step="year", stepmode="todate"),
            dict(count=1, label="1y", step="year", stepmode="backward"),
            dict(count = 9, label = "9m", step = "month", stepmode = "backward"),
            dict(step="all")
        ])
    )
)
fig.update_layout(title = "Deep AR Forecasting for Number of Bookings - City Hotel", width = 1050)
fig.show()
Sep 2016Nov 2016Jan 2017Mar 2017May 2017Jul 201750100150200250300
50.0%99.0%Forecast MedianDevice Values1m6mYTD1y9mallDeep AR Forecasting for Number of Bookings - City Hotel
In [154]:
fig.update_layout(width = 1500)
with open("Forecasting_Graphs.html", "a") as f:
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn'))

Evaluating the model

In [155]:
forecast_it, ts_it = make_evaluation_predictions(
    dataset=rh_test_data,  # test dataset
    predictor=predictor,  # predictor
    num_samples=100,  # number of sample paths we want for evaluation
)
In [156]:
ch_forecasts = list(forecast_it)
ch_tss = list(ts_it)
In [157]:
evaluator = Evaluator()
agg_metrics, item_metrics = evaluator(iter(ch_tss), iter(ch_forecasts), num_series=1)
Running evaluation: 100%|███████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 100.29it/s]
In [158]:
ch_metrics_dict = {}
metrics_list = ['MSE','RMSE', 'MASE','MAPE','sMAPE']

for metric in metrics_list:
    rh_metrics_dict[metric] = agg_metrics.get(metric)
    print(metric + ": ", rh_metrics_dict[metric])
MSE:  303.0056640625
RMSE:  17.407057880713214
MASE:  0.6045490456321023
MAPE:  0.24690375328063965
sMAPE:  0.2319811503092448
In [ ]:
 
In [ ]:
 

2) Auto ARIMA Forecasting

In [134]:
def fit_auto_arima(df_train, df_test, m, hotel):
    arima_model = auto_arima(df_train, start_p = 1, d = 0, start_q = 1, max_p = 3, max_d = 2, max_q = 3, start_P = 0, D = 1, start_Q = 0, m = m, seasonal = True, error_action = 'warn', trace = True, supress_warnings = True, stepwise = True)
    arima_model.summary()
    prediction = pd.DataFrame(arima_model.predict(n_periods = len(df_test), index = df_test.index))
    prediction.columns = ['predicted_values']
    plt.figure(figsize = (14,4))
    plt.plot(df_train, label = "Training")
    plt.plot(df_test, label = "Test")
    plt.plot(df_test.index,prediction, label = "Predicted")
    plt.title(f"Auto ARIMA for number of bookings for {hotel}")
    plt.legend()
    plt.show()
    return(arima_model,prediction)
In [136]:
ch_train, ch_test = get_train_test(ch_num, 60)
In [139]:
arima_model_ch, arima_pred_ch = fit_auto_arima(ch_train, ch_test, 12, "City")
Performing stepwise search to minimize aic
 ARIMA(1,0,1)(0,1,0)[12] intercept   : AIC=inf, Time=0.58 sec
 ARIMA(0,0,0)(0,1,0)[12] intercept   : AIC=8110.789, Time=0.03 sec
 ARIMA(1,0,0)(1,1,0)[12] intercept   : AIC=7899.011, Time=1.30 sec
 ARIMA(0,0,1)(0,1,1)[12] intercept   : AIC=inf, Time=0.88 sec
 ARIMA(0,0,0)(0,1,0)[12]             : AIC=8109.150, Time=0.03 sec
 ARIMA(1,0,0)(0,1,0)[12] intercept   : AIC=8110.329, Time=0.10 sec
 ARIMA(1,0,0)(2,1,0)[12] intercept   : AIC=7849.953, Time=3.88 sec
 ARIMA(1,0,0)(2,1,1)[12] intercept   : AIC=inf, Time=4.69 sec
 ARIMA(1,0,0)(1,1,1)[12] intercept   : AIC=inf, Time=1.53 sec
 ARIMA(0,0,0)(2,1,0)[12] intercept   : AIC=7856.496, Time=5.22 sec
 ARIMA(2,0,0)(2,1,0)[12] intercept   : AIC=7851.115, Time=4.93 sec
 ARIMA(1,0,1)(2,1,0)[12] intercept   : AIC=inf, Time=5.56 sec
 ARIMA(0,0,1)(2,1,0)[12] intercept   : AIC=7850.460, Time=2.98 sec
 ARIMA(2,0,1)(2,1,0)[12] intercept   : AIC=inf, Time=7.94 sec
 ARIMA(1,0,0)(2,1,0)[12]             : AIC=7849.266, Time=0.84 sec
 ARIMA(1,0,0)(1,1,0)[12]             : AIC=7897.796, Time=0.34 sec
 ARIMA(1,0,0)(2,1,1)[12]             : AIC=inf, Time=3.02 sec
 ARIMA(1,0,0)(1,1,1)[12]             : AIC=inf, Time=1.94 sec
 ARIMA(0,0,0)(2,1,0)[12]             : AIC=7856.124, Time=0.49 sec
 ARIMA(2,0,0)(2,1,0)[12]             : AIC=7850.341, Time=1.12 sec
 ARIMA(1,0,1)(2,1,0)[12]             : AIC=inf, Time=6.38 sec
 ARIMA(0,0,1)(2,1,0)[12]             : AIC=7849.816, Time=0.69 sec
 ARIMA(2,0,1)(2,1,0)[12]             : AIC=inf, Time=6.55 sec

Best model:  ARIMA(1,0,0)(2,1,0)[12]          
Total fit time: 61.008 seconds
In [140]:
def performance_metrics_auto_arima(y_true, y_pred):
    print(f"The performance metrics are : ")
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred)           
    print("Mean squared error is ", round(mse,4))
    print("Root mean squared error is ", round(rmse,4))
    print("Mean absolute error is ", round(mae,4))
    print("Mean absolute percentage error is ", round(mape*100,4))
In [141]:
performance_metrics_auto_arima(ch_test["Number_of_Bookings"], arima_pred_ch.predicted_values)
The performance metrics are : 
Mean squared error is  1325.3545
Root mean squared error is  36.4054
Mean absolute error is  30.7094
Mean absolute percentage error is  32.7301
In [ ]:
 

3) Prophet Forecasting

In [159]:
ch_m = Prophet()

Converting data into training format for Prophet algorithm

In [163]:
ch_prophet = ch_num.reset_index()
In [164]:
ch_prophet =  ch_prophet.rename(columns = {"datetime" : "ds", "Number_of_Bookings" : "y"})
In [165]:
ch_prophet.head(5)
Out[165]:
ds y
0 2015-07-01 79
1 2015-07-02 49
2 2015-07-03 16
3 2015-07-04 38
4 2015-07-05 8
In [166]:
pch_train, pch_test = get_train_test(ch_prophet, 60)
In [167]:
ch_m.fit(pch_train)
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
Out[167]:
<fbprophet.forecaster.Prophet at 0x19098f37ec8>
In [168]:
date_list = pd.date_range(start = pch_test.ds.iloc[0], end = pch_test.ds.iloc[-1])
future = pd.DataFrame(date_list, columns = ["ds"])
In [169]:
forecast_ch_prophet = ch_m.predict(future)

Funtion to plot the forecast

In [180]:
fig_prophet_ch = plot_forecast_prophet_algo(pch_train, pch_test, forecast_ch_prophet, "City")
Apr 2017May 2017Jun 2017Jul 2017Aug 201750100150200250300
Predicted_lowerPredictedNumber of BookingsNumber of BookingsProphet Forecasting for Number of Bookings in Future Dates - City
In [175]:
fig_prophet_ch.update_layout(width = 1500)
with open("Forecasting_Graphs.html", "a") as f:
    f.write(fig_prophet_ch.to_html(full_html = False, include_plotlyjs='cdn'))

Evaluating the model

In [177]:
y_true = pch_test.y.values
In [178]:
y_pred = forecast_ch_prophet.yhat.values
In [179]:
performance_metrics_prophet(y_true, y_pred)
The performance metrics are: 
Mean squared error is  1198.8528
Mean absolute error is  27.9716
Root Mean Squared Error is  34.6245
Mean absolute percentage error is  29.3397
In [ ]:
 
In [ ]: